

import urllib.request
import json

# 正则
import re

import ssl


# Page to scrape: first page of the site's text listing.
url = "https://www.qiushibaike.com/text/page/1/"


# Identify as a desktop Chrome browser via the User-Agent header.
request = urllib.request.Request(url)
request.add_header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36")

# NOTE(review): this context disables TLS certificate verification and relies
# on a private helper of the ssl module. Kept as-is to preserve behavior;
# prefer ssl.create_default_context() if the site's certificate validates.
ssl_context = ssl._create_unverified_context()

# Use the response as a context manager so the underlying connection is
# closed as soon as the body has been read.
with urllib.request.urlopen(request, context=ssl_context) as response:
    html = response.read().decode("utf-8")

# Keep a copy of the downloaded page on disk.
filepath = r"E:\python\spider-file\file3.html"
with open(filepath, "w", encoding="utf-8") as file:
    file.write(html)


# One entry = everything between the opening of the author block and the
# vote-count span; re.S lets "." span the newlines inside an entry.
pat = r'<div class="author clearfix">(.*?)<span class="stats-vote">'
re_joke = re.compile(pat, re.S)
divList = re_joke.findall(html)

# Per-entry patterns are loop-invariant, so compile them once up front.
usernamePat = r'<h2>(.*?)</h2>'
username_joke = re.compile(usernamePat, re.S)

contentPat = r'<div class="content">\n<span>(.*?)</span>(\n\n|\n)</div>\n</a>'
content_joke = re.compile(contentPat, re.S)

# Maps username -> content. Entries sharing a username overwrite each other,
# so only the last one seen for a given name is kept.
dic = {}
for div in divList:
    username_match = username_joke.search(div)
    if username_match is None:
        # Skip the entry instead of crashing on an empty result.
        print("+++++++++++++++++usernameList is None")
        continue
    username = username_match.group(1).strip()

    # contentPat has two capture groups, so findall() would return tuples;
    # search() + group(1) extracts just the text inside <span>...</span>.
    content_match = content_joke.search(div)
    if content_match is None:
        print("+++++++++++++++++contentList is None")
        continue
    content = content_match.group(1).strip()

    dic[username] = content
    print("username = %s, content = %s" % (username, content))
